In [2]:
from __future__ import print_function
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import zipfile
from IPython.display import display, Image
from scipy import ndimage
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from skimage import color, io
from scipy.misc import imresize
# Seed NumPy's global RNG, which the np.random.choice / np.random.permutation
# calls further down draw from.
np.random.seed(133)
# Config the matplotlib backend as plotting inline in IPython
%matplotlib inline
# Side length, in pixels, of the square that images are resized to
# (used as (IMAGE_SIZE, IMAGE_SIZE) by the imresize calls below).
IMAGE_SIZE = 224
First, download the data from Kaggle.
In [2]:
# Last percentage written by download_progress_hook. Kept at module level so it
# survives between the successive callbacks of one download.
last_percent_reported = None


def download_progress_hook(count, blockSize, totalSize):
    """Report download progress on stdout; meant to be used as a urlretrieve reporthook.

    This is mostly intended for users with slow internet connections. Whenever
    the integer percentage changes, a multiple of 5 is written as "<n>%" and
    any other value as a single ".".

    Args:
        count: Number of blocks transferred so far.
        blockSize: Size of one block, in bytes.
        totalSize: Total download size, in bytes. A value <= 0 means the size
            is unknown; nothing is reported in that case.
    """
    global last_percent_reported

    # Without a usable total there is no percentage to compute (and a total of
    # 0 would divide by zero).
    if totalSize <= 0:
        return

    # The final block is usually only partly filled, so count * blockSize can
    # overshoot the total; cap the figure at 100.
    percent = min(100, int(count * blockSize * 100 // totalSize))

    if last_percent_reported != percent:
        if percent % 5 == 0:
            sys.stdout.write("%s%%" % percent)
        else:
            sys.stdout.write(".")
        sys.stdout.flush()
        last_percent_reported = percent
def maybe_download(filename, url, expected_bytes, force=False):
    """Download `url` to `filename` unless the file already exists, then sanity-check it.

    The check that decides success is only that the file is non-empty. The
    size is additionally compared with `expected_bytes`, but a mismatch is
    merely reported, never treated as an error.

    Args:
        filename: Local path of the file.
        url: Location to fetch the file from when it is missing (or `force`).
        expected_bytes: Size the file is expected to have; a falsy value
            disables the comparison.
        force: Download even if `filename` already exists.

    Returns:
        The path of the local file.

    Raises:
        Exception: If the local file is empty.
    """
    if force or not os.path.exists(filename):
        print('Attempting to download:', filename)
        filename, _ = urlretrieve(url, filename, reporthook=download_progress_hook)
        print('\nDownload Complete!')

    actual_bytes = os.stat(filename).st_size
    if not actual_bytes:
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    if expected_bytes and actual_bytes != expected_bytes:
        # Informational only: surfaces a truncated or unexpected file without
        # rejecting archives whose expected size was given approximately.
        print('Note: %s is %d bytes, not the expected %d bytes; continuing anyway.'
              % (filename, actual_bytes, expected_bytes))
    return filename
# Fetch both competition archives; maybe_download skips the transfer when the
# zip file already exists locally.
# NOTE(review): the `se=2017-02-26...` query parameter looks like an expiry
# timestamp of these signed URLs -- if so, the links no longer work and the
# archives have to be placed next to the notebook by hand. TODO confirm.
# NOTE(review): the third argument (expected size) is not enforced by
# maybe_download; only a non-empty file is required.
test_filename = maybe_download('test.zip', 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/5441/test.zip?sv=2015-12-11&sr=b&sig=YtsCaH8gL7dObP11aL7iD9VVaJ%2BGtnls3%2FzBiE8vfjE%3D&se=2017-02-26T15%3A55%3A36Z&sp=r', 71303168)
train_filename = maybe_download('train.zip', 'https://kaggle2.blob.core.windows.net/competitions-data/kaggle/5441/train.zip?sv=2015-12-11&sr=b&sig=7UzYtGnmwvxodWZtaMVFjzmfSXLUM%2FMVjOpmtBZId28%3D&se=2017-02-26T16%3A02%3A58Z&sp=r', 566181888)
Extract the images
In [3]:
def maybe_extract(filename, force=False):
    """Unpack the zip archive `filename` next to itself unless that was already done.

    The target folder is the archive path with its extension(s) removed
    ("train.zip" -> "train"). The function assumes the archive contains a
    top-level folder of that name; it does not check the archive's layout.

    Args:
        filename: Path of the zip archive.
        force: Extract even if the target folder already exists.

    Returns:
        The target folder path with a trailing '/'.
    """
    # Two splitext passes strip up to two extensions (a leftover from .tar.gz
    # handling); for a plain "name.zip" the second pass changes nothing.
    root = os.path.splitext(os.path.splitext(filename)[0])[0]
    if os.path.isdir(root) and not force:
        # You may override by setting force=True.
        print('%s already present - Skipping extraction of %s.' % (root, filename))
    else:
        print('Extracting data from %s. This may take a while. Please wait.' % filename)
        sys.stdout.flush()
        # Extract beside the archive so that `root` is where the files really
        # land, even when `filename` carries a directory component.
        destination = os.path.dirname(filename) or '.'
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(filename) as archive:
            archive.extractall(destination)
    return root + '/'
# Unpack both archives (training set first) and remember the resulting folders.
train_folder, test_folder = [
    maybe_extract(archive_name) for archive_name in (train_filename, test_filename)
]
Build lists of the image files and display a random image from each set to verify that the extraction worked.
In [4]:
# Paths of all extracted files. os.listdir() returns entries in arbitrary
# order, so the names are sorted: that keeps list positions (and therefore the
# seeded random picks and any index-based lookups) stable between runs.
train_images = [train_folder + name for name in sorted(os.listdir(train_folder))]
test_images = [test_folder + name for name in sorted(os.listdir(test_folder))]

# Smoke test: show one randomly chosen picture from each set with its path.
random_image = np.random.choice(train_images)
print(random_image)
image = Image(random_image)
display(image)

random_image = np.random.choice(test_images)
print(random_image)
display(Image(random_image))
Now let's see what the images look like. Let's examine their shapes.
In [5]:
from PIL import Image as image


def _image_size(path):
    """Return the `.size` tuple of the image at `path`, closing the file afterwards."""
    # Opening inside a `with` block releases the file handle straight away
    # instead of leaving thousands of open files to the garbage collector.
    with image.open(path) as picture:
        return picture.size


# One row per image, holding that image's `.size` tuple.
dimensions_train = np.matrix([_image_size(path) for path in train_images], dtype=np.float32)
dimensions_test = np.matrix([_image_size(path) for path in test_images])
print(dimensions_train.shape)
print(dimensions_test.shape)
In [6]:
# Ratio of the first size column to the second for every training image
# (PIL reports size as (width, height), so this is width / height).
aspect_train = dimensions_train[:, 0] / dimensions_train[:, 1]

# (format string, reducer) pairs for the per-column size summaries ...
size_summaries = (
    ("min: %s", np.min),
    ("max: %s", np.max),
    ("mean: %s", np.mean),
    ("median: %s", np.median),
    ("stdev: %s", np.std),
)
# ... and for the aspect-ratio summaries, which reduce over all values.
aspect_summaries = (
    ("aspect min: %s", np.min),
    ("aspect max: %s", np.max),
    ("aspect mean: %s", np.mean),
    ("aspect stdev: %s", np.std),
)

print("Training set:")
for template, reducer in size_summaries:
    print(template % reducer(dimensions_train, axis=0))
for template, reducer in aspect_summaries:
    print(template % reducer(aspect_train))

print("Test set:")
for template, reducer in size_summaries:
    print(template % reducer(dimensions_test, axis=0))
In [7]:
# Distribution of the training-set aspect ratios, drawn with a logarithmic
# count axis (log=True).
fig, ax = plt.subplots()
ax.hist(aspect_train, bins='auto', log=True)
ax.set_title("Aspect ratio Histogram (log scale)")
plt.show()
In [8]:
# empirically set aspect cutoff to 1:2
# (Percentile-derived cutoffs at 0.2 / 99.8 were computed here as well, but
# they were overwritten by these constants before being used, so they are gone.)
low_pct_aspect = 0.5
high_pct_aspect = 2.0

# Flatten to a plain 1-D array so the comparisons run vectorised regardless of
# whether aspect_train is a column matrix or an ordinary array.
aspect_values = np.asarray(aspect_train).ravel()

# Positions (as Python ints) of the images outside the accepted ratio range.
low_pct_aspect_indices = np.flatnonzero(aspect_values < low_pct_aspect).tolist()
high_pct_aspect_indices = np.flatnonzero(aspect_values > high_pct_aspect).tolist()
In [9]:
print(low_pct_aspect_indices)


def display_train_image_by_idx(idx):
    """Render the training image stored at position `idx` of `train_images`."""
    picture_path = train_images[idx]
    display(Image(picture_path))


# Inspect every image whose aspect ratio is below the lower cutoff: the picture
# itself, then its ratio and its path.
for low_idx in low_pct_aspect_indices:
    display_train_image_by_idx(low_idx)
    print(aspect_train[low_idx])
    print(train_images[low_idx])
In [10]:
print(high_pct_aspect_indices)

# Inspect every image whose aspect ratio is above the upper cutoff: the picture
# itself, then its ratio and its path.
for high_idx in high_pct_aspect_indices:
    display_train_image_by_idx(high_idx)
    print(aspect_train[high_idx])
    print(train_images[high_idx])
In [11]:
# Work on a plain ndarray so the percentile result is a flat, two-element
# array (one threshold per size column) whatever container type
# dimensions_train is.
dimension_values = np.asarray(dimensions_train)

# 3rd percentile of each size column of the training images.
low_pct_dimension = np.percentile(dimension_values, 3, axis=0)
# array([ 163., 150.])

# An image counts as small when either of its dimensions falls below the
# corresponding threshold.
small_images_indices = np.flatnonzero(
    (dimension_values < low_pct_dimension).any(axis=1)).tolist()
print(small_images_indices)

# Inspect each small image: the picture, then its dimensions and its path.
for i in small_images_indices:
    display_train_image_by_idx(i)
    print(dimensions_train[i])
    print(train_images[i])
In [12]:
#sanitize train images
# Collect the rejected positions in a set so each membership test is O(1)
# instead of scanning three lists for every training image.
excluded_indices = (set(low_pct_aspect_indices)
                    | set(high_pct_aspect_indices)
                    | set(small_images_indices))
train_images_sane = [path for position, path in enumerate(train_images)
                     if position not in excluded_indices]
In [13]:
print("Filtered images: %s+%s+%s" % (len(low_pct_aspect_indices),
                                     len(high_pct_aspect_indices),
                                     len(small_images_indices)))

# Sorted union of all rejected positions. Each group is converted to an integer
# array first: an empty Python list would otherwise become a float array and
# turn the whole union into floats, which cannot be used as list indices later.
insane_indices = np.unique(np.concatenate([
    np.asarray(group, dtype=int)
    for group in (low_pct_aspect_indices, small_images_indices, high_pct_aspect_indices)
]))
print("Filtered in total: ", len(insane_indices))
print("Remaining images in training set:", len(train_images_sane))
In [22]:
from scipy.ndimage.filters import gaussian_filter
def show_image(filepath):
    """Plot the resized image at `filepath` plus two transformed variants.

    The image is read, resized to IMAGE_SIZE x IMAGE_SIZE with bicubic
    interpolation, and shown three times: as is, flipped left-to-right, and
    rotated with np.rot90.

    Returns:
        The resized (untransformed) image as a NumPy array.
    """
    resized = np.array(
        imresize(io.imread(filepath), (IMAGE_SIZE, IMAGE_SIZE), interp='bicubic'))
    for variant in (resized, np.fliplr(resized), np.rot90(resized)):
        plt.imshow(variant)
        plt.show()
    return resized
# Preview one entry of the filtered list and one entry of the unfiltered list,
# printing each path after its plots.
for image_list, position in ((train_images_sane, 1223), (train_images, 24969)):
    img = show_image(image_list[position])
    print(image_list[position])
# Shape of the last resized preview.
img.shape
Out[22]:
In [39]:
# Persist the file lists and the rejected positions so later sessions can
# reuse them without repeating the filtering.
pickle_file = 'catfish.pickle'
save = {
    'train_images': train_images,
    'train_images_sane': train_images_sane,
    'insane_indices': insane_indices,
    'test_images': test_images,
}
try:
    # The context manager closes the file even when pickle.dump raises.
    with open(pickle_file, 'wb') as f:
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

# Report the on-disk size of the file just written.
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)
In [15]:
def read_image(filepath):
    """Load the image at `filepath` and return it resized to IMAGE_SIZE x IMAGE_SIZE.

    Resizing uses bicubic interpolation; the result is a NumPy array.
    """
    pixels = io.imread(filepath)
    target_shape = (IMAGE_SIZE, IMAGE_SIZE)
    return np.array(imresize(pixels, target_shape, interp='bicubic'))
# Training inputs: every image that survived the filtering, resized.
X_train = np.array([read_image(path) for path in train_images_sane])
# Labels: 1 when the file name contains 'dog', else 0. Only the base name is
# inspected so that a directory called e.g. "dogs-vs-cats" cannot turn every
# label into 1.
Y_train = np.array([int('dog' in os.path.basename(path)) for path in train_images_sane])
# Convert to float32 and divide by 255
# (presumably the pixels are 8-bit, which would map them to [0, 1] -- TODO confirm).
X_train = X_train.astype('float32')
X_train /= 255
In [16]:
# Verification inputs: the images that were filtered out of the training set.
X_verify = np.array([read_image(train_images[i]) for i in insane_indices])
# Labels: 1 when the file name contains 'dog', else 0. Only the base name is
# inspected so that a directory name containing "dog" cannot affect the label.
Y_verify = np.array([int('dog' in os.path.basename(train_images[i])) for i in insane_indices])
# Convert to float32 and divide by 255
# (presumably the pixels are 8-bit, which would map them to [0, 1] -- TODO confirm).
X_verify = X_verify.astype('float32')
X_verify /= 255
In [23]:
def randomize(dataset, labels):
    """Shuffle `dataset` and `labels` with one shared random permutation.

    Rows are reordered along the first axis only, so sample i of the result
    still carries its original label. Works for arrays of any dimensionality.

    Args:
        dataset: NumPy array whose first axis indexes the samples.
        labels: NumPy array with one entry per sample.

    Returns:
        Tuple (shuffled_dataset, shuffled_labels); the inputs are not modified.
    """
    permutation = np.random.permutation(labels.shape[0])
    # Index the first axis only: unlike dataset[permutation, :, :] this does
    # not require the data to have at least three dimensions.
    shuffled_dataset = dataset[permutation]
    shuffled_labels = labels[permutation]
    return shuffled_dataset, shuffled_labels
#X_train, Y_train = randomize(X_train, Y_train)
#X_verify, Y_verify = randomize(X_verify, Y_verify)
In [31]:
# Spot-check: draw a random position in the training arrays, plot that image
# and print its label.
random_image = np.random.choice(len(train_images_sane))
sample_pixels, sample_label = X_train[random_image], Y_train[random_image]
plt.imshow(sample_pixels)
print(sample_label)
In [33]:
# Spot-check: draw a random position in the verification arrays, plot that
# image and print its label.
random_image = np.random.choice(len(insane_indices))
sample_pixels, sample_label = X_verify[random_image], Y_verify[random_image]
plt.imshow(sample_pixels)
print(sample_label)
In [1]:
# Resize demo: scale one training image to 1600x900 with Lanczos resampling.
# PIL.Image is reached through the package name so that neither IPython's
# `Image` display class nor the `image` module alias gets rebound.
# NOTE(review): the original cell did not say which picture was meant; the
# first filtered training image is used as a stand-in -- adjust as needed.
import PIL.Image

with PIL.Image.open(train_images_sane[0]) as original_picture:
    resized_picture = original_picture.resize((1600, 900), PIL.Image.LANCZOS)
In [ ]: